# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
### Reducing Prejudice and Support for Religious 
### Nationalism Through Conversations on WhatsApp 

### Author: Rajeshwari Majumdar
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# File:     supp_participation.R
# Purpose:  analyze various measures related to participation in the experiment 
#        1. selection into the experiment (Appendix C.1);
#        2. survey response rates (Appendix C.2);
#        3. volume of conversation (Appendix C.3);  
#        4. during-conversation attrition/differential attrition (Appendix D.3)
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# %%%%%%%%%%%%%%%%%%%%%%%%%
# C.1 Who Participates ----
# %%%%%%%%%%%%%%%%%%%%%%%%%

## Load data 
rm(list = ls())
load("data/Recruits_Anonymized.RData") 
# dataset: everyone from the recruitment survey; "cf2drop" indicates whether one chose to sign up

## Check signup rate
tabyl(d$cf2drop)

## Variables to compare signups and drops on
d$religion_Muslim = ifelse(d$religion=="Muslim",1,0)
vars.forall = c("religion_Muslim","age","gender_f","college","married","socialapps_WhatsApp")
vars.forhindus = c("castefw","supportBJP","feelth_muslim","hasfriend_muslim")
vars.formuslims = c("castefw","supportBJP","feelth_hindu","hasfriend_hindu")

## Table C.1: Demographics of those who signed up vs. those who did not ----
mean_signedup = numeric() 
mean_dropped = numeric()
diff = numeric()
pval = numeric()
for (i in 1:length(vars.forall)){
  tdata = t.test(d[d$cf2drop==0, vars.forall[i]],
                 d[d$cf2drop==1, vars.forall[i]]
  )
  mean_signedup[i] = tdata$estimate[1]
  mean_dropped[i] = tdata$estimate[2]
  diff[i] = abs(mean_signedup[i] - mean_dropped[i])
  pval[i] = tdata$p.value
}
out = as.data.frame(cbind(vars.forall, mean_signedup, mean_dropped, diff, pval))
out[,2:ncol(out)] = apply(out[,2:ncol(out)], 2, as.numeric)
out[,2:ncol(out)] = round(out[,2:ncol(out)], 2)                    
out

## Table C.2: Group-based characteristics of Hindus who signed up vs. those who did not ----
dh = d[d$religion=="Hindu",]
mean_signedup = numeric()
mean_dropped = numeric()
diff = numeric()
pval = numeric()
for (i in 1:length(vars.forhindus)){
  tdata = t.test(dh[dh$cf2drop==0, vars.forhindus[i]],
                 dh[dh$cf2drop==1, vars.forhindus[i]])
  mean_signedup[i] = tdata$estimate[1]
  mean_dropped[i] = tdata$estimate[2]
  diff[i] = abs(mean_signedup[i] - mean_dropped[i])
  pval[i] = tdata$p.value
}
out.hindus = as.data.frame(cbind(vars.forhindus, mean_signedup, mean_dropped, diff, pval))
out.hindus[,2:ncol(out.hindus)] = apply(out.hindus[,2:ncol(out.hindus)], 2, as.numeric)
out.hindus[,2:ncol(out.hindus)] = round(out.hindus[,2:ncol(out.hindus)], 2)                    
out.hindus

## Table C.3: Group-based characteristics of Muslims who signed up vs. those who did not ---- 
dm = d[d$religion=="Muslim",]
mean_signedup = numeric()
mean_dropped = numeric()
diff = numeric()
pval = numeric()
for (i in 1:length(vars.formuslims)){
  tdata = t.test(dm[dm$cf2drop==0, vars.formuslims[i]],
                 dm[dm$cf2drop==1, vars.formuslims[i]])
  mean_signedup[i] = tdata$estimate[1]
  mean_dropped[i] = tdata$estimate[2]
  diff[i] = abs(mean_signedup[i] - mean_dropped[i])
  pval[i] = tdata$p.value
}
out.muslims = as.data.frame(cbind(vars.formuslims, mean_signedup, mean_dropped, diff, pval))
out.muslims[,2:ncol(out.muslims)] = apply(out.muslims[,2:ncol(out.muslims)], 2, as.numeric)
out.muslims[,2:ncol(out.muslims)] = round(out.muslims[,2:ncol(out.muslims)], 2)                    
out.muslims

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# C.2 Survey Response Rates ----
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

## Load data 
rm(list = ls())
load("data/Completes_Merged_Anonymized.RData")

## Table C.4: List of surveys with response rates ----
survey.completed.vars = paste("s",4:13,".completed",sep="")
survey.completed.props = d %>% select(all_of(survey.completed.vars)) %>% summarise(across(everything(), ~ sum(.x == TRUE, na.rm = TRUE) / n())) 
survey.completed.props

## overall response rate across surveys 
summary(unlist(survey.completed.props))

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# C.4 Conversation characteristics ----
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 

## Load data 
rm(list=ls())
load("data/Completes_Merged_Anonymized.RData")

## Summary statistics for word count and message count 
summary(d$chat_wordcount_total)
summary(d$chat_msgcount_total)

## Figure C.1: Number of words exchanged by topic condition ----

### Panel (a) All observations included (N = 535)  
df_wordcount = d %>% 
  group_by(condition_topic, condition_pair) %>%
  summarize(N = length(na.omit(chat_wordcount_total)), 
            chat_words_mean = mean(chat_wordcount_total, na.rm=TRUE),
            chat_words_sd = sd(chat_wordcount_total, na.rm=TRUE))
df_wordcount$chat_words_se = df_wordcount$chat_words_sd / sqrt(df_wordcount$N)  

### Panel (b) Outliers excluded (N = 506)
lower_bound = as.numeric(summary(d$chat_wordcount_total)[2])-(1.5*IQR(d$chat_wordcount_total)) 
upper_bound = as.numeric(summary(d$chat_wordcount_total)[5])+(1.5*IQR(d$chat_wordcount_total)) 
x = which(d$chat_wordcount_total < lower_bound | d$chat_wordcount_total > upper_bound)
d_trunc = d[-x,]
df_wordcount = d_trunc %>% 
  group_by(condition_topic, condition_pair) %>%
  summarize(N = length(na.omit(chat_wordcount_total)), 
            chat_words_mean = mean(chat_wordcount_total, na.rm=TRUE),
            chat_words_sd = sd(chat_wordcount_total, na.rm=TRUE))
df_wordcount$chat_words_se = df_wordcount$chat_words_sd / sqrt(df_wordcount$N)

### Plot
ggplot(df_wordcount, aes(x = condition_topic, y = chat_words_mean, fill=condition_pair)) + 
  geom_bar(position="dodge", stat="identity") +
  geom_errorbar(aes(ymin  = chat_words_mean - chat_words_se,
                    ymax  = chat_words_mean + chat_words_se),
                width = 0.2,
                size  = 1,
                position = position_dodge(.9)) + 
  ylim(0,3000) +
  xlab("Conversation Topic") + ylab("Average word count over five days") +
  scale_x_discrete(labels = c("Non-pol.","General pol.","IGR")) +
  scale_fill_manual(values=c("blue", "brown"), 
                    name="Pair Type:",
                    labels=c("Hindu-Hindu", "Hindu-Muslim")) +
  theme_bw() + theme_minimal() 


# Note #
# Figures C.2 and C.3 draw directly on WhatsApp chat transcripts, 
# which cannot be published given privacy considerations. 


# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
# D.3 Differential Attrition ----
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 

## Load data
rm(list = ls())
load("data/Starts_Anonymized.RData") 
# dataset: every conversation thread that was started;
# "completed" indicates whether the thread made it to the end;
# unit of analysis is participant, so two observations for each idthread;
# at least one observation per participant; 
# multiple observations for participants who were reassigned after a failed thread

## Check attrition rates
d %>% tabyl(completed)
d %>% tabyl(condition_pair, completed) %>% adorn_percentages("row") %>% adorn_ns()
d %>% tabyl(condition_topic, completed) %>% adorn_percentages("row") %>% adorn_ns()

## Table D.4: Correlates of attrition, by partner religion condition ----
attrition.all = lm(completed ~ condition_topic + age + gender_f + socialapps_WhatsApp + castefw + supportBJP + feelth_muslim + hasfriend_muslim, 
                  data = d)
attrition.HH = lm(completed ~ condition_topic +  age + gender_f + socialapps_WhatsApp + castefw + supportBJP + feelth_muslim + hasfriend_muslim, 
                 data = subset(d, condition_pair=="HH"))
attrition.HM = lm(completed ~ condition_topic + age + gender_f + socialapps_WhatsApp + castefw + supportBJP + feelth_muslim + hasfriend_muslim, 
                 data = subset(d, condition_pair=="HM"))

modelsummary(list(attrition.all, attrition.HH, attrition.HM),
             coef_map = c(
               "condition_topicgen" = "Topic: Gen. pol.",
               "condition_topicigr" = "Topic: IGR",
               "prejudice_binary" = "Baseline prejudice",
               "age" = "Age",
               "gender_f" = "Female",
               "socialapps_WhatsAppTRUE" = "Freq. WhatsApp User",
               "castefw" = "Forward Caste",
               "supportBJP" = "BJP Support",
               "feelth_muslim" = "Feel About Muslims",
               "hasfriend_muslim" = "Has Muslim Friend",
               "(Intercept)" = "Constant"
             ),
             stars = c(`***` = 0.01, `**` = 0.05, `*` = 0.10),
             gof_map = "nobs"
             )


## Table D.5: IPW estimates ----
# following Gomila and Clark (2022)
dh = subset(d, religion == "Hindu")
dh$treatment = dh$condition_pair
depvars = c("post.feelth_muslim","follow.feelth_muslim",
            "twt_HinduNation_q1","twt_Hijab_q1","twt_HinduWeap_q1","twt_Terrorism_q1")
depvars = which(names(dh) %in% depvars)

mod = list()
for (i in 1:length(depvars)){
  dh$depvar = dh[,depvars[i]]
  dh$response = as.numeric(!is.na(dh$depvar))
  fit_p_resp = glm(response ~ age * treatment + 
                      gender_f * treatment + 
                      prejudice_binary * treatment,
                    family = binomial(link = "logit"),
                    data = dh)
  p_resp = fit_p_resp$fitted
  gen_weights = 1/p_resp
  fit_ipw = lm(depvar ~ treatment + age + gender_f + prejudice_binary,
                weights = gen_weights,
                data = dh)
  mod[[i]] = fit_ipw
}

modelsummary(mod,
             coef_map = c(
               "treatmentHM" = "Hindu-Muslim pair",
               "age" = "Age",
               "gender_f" = "Female",
               "prejudice_binary" = "Baseline prejudice",
               "(Intercept)" = "Constant"
             ),
             stars = c(`***` = 0.01, `**` = 0.05, `*` = 0.10),
             gof_map = "nobs"
             )


## Table D.6 Imputing against-hypothesis values (outcome: short-term feelings about Muslims) ----
### Consider only the first appearance for everyone
dh1 = d[d$religion=="Hindu" & d$appearance_number==1,]
tabyl(dh1$completed)
tabyl(dh1$post.feelth_muslim)

### Most extreme: Hindus who spoke to a Muslim assigned a value of 1 
dh1$post.feelth_muslim.imp = ifelse(is.na(dh1$post.feelth_muslim)==FALSE, dh1$post.feelth_muslim,
                                     ifelse(dh1$condition_pair=="HM", 1, dh1$feelth_muslim))
summary(lm(post.feelth_muslim.imp ~ condition_pair, data = dh1))

### Still extreme, but less so: Hindus who spoke to a Muslim assigned a value of 2 
dh1$post.feelth_muslim.imp = ifelse(is.na(dh1$post.feelth_muslim)==FALSE, dh1$post.feelth_muslim,
                                    ifelse(dh1$condition_pair=="HM", 2, dh1$feelth_muslim))
summary(lm(post.feelth_muslim.imp ~ condition_pair, data = dh1))

### Then: Hindus who spoke to a Muslim assigned a value of 3 (also equal to the median post-treatment value in HH pairs)
median(na.omit(dh$post.feelth_muslim[dh$condition_pair=="HH"]))
dh1$post.feelth_muslim.imp = ifelse(is.na(dh1$post.feelth_muslim)==FALSE, dh1$post.feelth_muslim,
                                    ifelse(dh1$condition_pair=="HM", 3, dh1$feelth_muslim))
summary(lm(post.feelth_muslim.imp ~ condition_pair, data = dh1))

### Finally: give everyone in the dataset their pre-treatment feeling
dh1$post.feelth_muslim.imp = ifelse(is.na(dh1$post.feelth_muslim)==FALSE, dh1$post.feelth_muslim,
                                    dh1$feelth_muslim)
summary(lm(post.feelth_muslim.imp ~ condition_pair, data = dh1))


## Figure D.1: Sensitivity of imputing against-hypothesis responses (outcome: short-term feelings about Muslims) ----
### Function to impute outcome for k = 1,..,N randomly selected attritors
bnd = function(k){
  toimpute = which(dh1$completed==0)
  
  # randomly select k attritors to impute against-hypothesis values
  toimpute_against = sample(toimpute, k)
  
  # remaining attritors' outcome value will be set to their pre-treatment feeling
  toimpute_nochange = setdiff(toimpute, toimpute_against)
  
  for (i in 1:length(toimpute_against)){
    dh1[toimpute_against[i], "post.feelth_muslim"] = ifelse(dh1[toimpute_against[i], "condition_pair"] == "HH", 
                                                            dh1[toimpute_against[i], "feelth_muslim"], 
                                                               2)
  }
  
  for (i in 1:length(toimpute_nochange)){
    dh1[toimpute_nochange[i], "post.feelth_muslim"] = dh1[toimpute_nochange[i], "feelth_muslim"]
  }
  
  # estimate treatment effects using standard specification 
  feelth_reg = lm(post.feelth_muslim ~ condition_pair + prejudice_binary, data = dh1)
  
  return(cbind(
    coef(summary(feelth_reg))[2,1],
    coef(summary(feelth_reg))[2,4]
  )
  )
}

### Loop to do this for 1,000 samples of size k and extract coefficient estimates & p-values
# (might take some time!)
kvctr = 1:273
j = 0
coefs.pvs.all = numeric()
while (j<1000){
  bnds = function(kvctr){
    coef = numeric()
    pv = numeric()
    for (i in 1:length(kvctr)){
      res = bnd(kvctr[i])
      coef[i] = res[1]
      pv[i] = res[2]
    }
    return(cbind(kvctr,coef,pv))
  }
  coefs.pvs.all = rbind(coefs.pvs.all, bnds(kvctr))
  j=j+1
}
coefs.pvs.all = as.data.frame(coefs.pvs.all)

### Plot: for each k, show proportion of significant p-values; overlay average coefficient estimate
coefs = coefs.pvs.all %>% group_by(kvctr) %>% summarise(mean(coef))
names(coefs) = c("k","avg")
alph0.01 = numeric()
alph0.05 = numeric()
alph0.10 = numeric()
for (i in 1:length(kvctr)){
  corresp.pvalues = coefs.pvs.all[which(coefs.pvs.all[,1] == kvctr[i]), 3]
  alph0.01[i] = length(which(corresp.pvalues<0.01)) / j
  alph0.05[i] = length(which(corresp.pvalues<0.05)) / j
  alph0.10[i] = length(which(corresp.pvalues<0.10)) / j
}
results = as.data.frame(cbind(kvctr, alph0.10, alph0.05, alph0.01))
names(results) = c("k","0.10","0.05","0.01")
results = reshape2::melt(results, id=c("k"))

ggplot() + 
  geom_line(data = results, aes(x = k, y = value, color = variable, linetype = variable)) +
  geom_line(data = coefs, aes(x = k, y = avg)) +
  scale_x_continuous(limits=c(1,275), breaks=seq(0,275,25))
